In [1]:
# import sys
# import os
# import glob
import numpy as np
import scipy.interpolate
import pandas as pd
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# import mpl_toolkits.axisartist
# from mpl_toolkits.axes_grid1 import host_subplot
# import dateutil
# import statsmodels.formula.api as smformula
# from collections import OrderedDict
# from IPython.display import display, HTML
# %matplotlib notebook
# plt.style.use('seaborn-paper')
In [2]:
def tempF2C(x):
    return (x - 32.0) * 5.0 / 9.0

def tempC2F(x):
    return (x * 9.0 / 5.0) + 32.0
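A quick sanity check of the converters at two known fixed points of the scales:

In [ ]:
# sanity check at the boiling and freezing points of water
assert abs(tempF2C(212.0) - 100.0) < 1e-9
assert abs(tempC2F(0.0) - 32.0) < 1e-9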
In [3]:
def compute_year_over_year_norm(in_dataframe,
                                start, end,
                                norm_start=None, norm_end=None,
                                freq='daily',
                                interp_method='linear',
                                quiet=False):
"""
Will propagate the timezone of in_dataframe.index to the return values
Parameters
----------
start: convertable to Datetime
start range of dates to output
end: convertable to Datetime
end range of dates to output
norm_start : convertable to Datetime or None
`None` will use in_dataframe.index[0]
norm_end : convertable to Datetime or None
if given (not None), output range does not include `norm_end` (it is half-open)
`None` will use in_dataframe.index[-1]
freq : {'daily', 'hourly'}
interp_method : str or None
`None` will skip resample and interpolation, so
`in_dataframe` must already be daily or hourly (depending on `freq`)!
Returns
-------
norm, resampled_in_dataframe
Example
-------
tempnorm, tempresamp = compute_year_over_year_norm(tempdf,
START_DATE, END_DATE,
norm_end=END_DATE,
freq='hourly',
interp_method='linear',
quiet=False)
"""
    if freq == 'hourly':
        hrs = 24
        hrs_freq = '1h'
    elif freq == 'daily':
        hrs = 1
        hrs_freq = '24h'
    else:
        raise ValueError("Invalid `freq` argument value: {}".format(freq))
    if norm_start is None:
        norm_start = in_dataframe.index[0]
    else:
        norm_start = pd.to_datetime([norm_start])[0].tz_localize(in_dataframe.index.tz)  # - pd.Timedelta('1 second')
    if norm_end is None:
        norm_end = in_dataframe.index[-1]
    else:
        norm_end = pd.to_datetime([norm_end])[0].tz_localize(in_dataframe.index.tz)  # - pd.Timedelta('1 second')
    if not quiet:
        print('Computing using range:', norm_start, 'to', norm_end)
    if interp_method is None:  # skip resample+interpolation (in_dataframe must already match `freq`!)
        t = in_dataframe.loc[norm_start:norm_end]
    else:  # resample and interpolate to the requested frequency
        # pandas resample: simple but might require upsampling in some cases
        # t = in_dataframe.resample(hrs_freq).interpolate(method=interp_method).loc[norm_start:norm_end]
        # using scipy.interpolate.interp1d: seems safer / more reliable than pandas resample
        nidx = pd.date_range(norm_start, norm_end, freq=hrs_freq)
        if isinstance(in_dataframe, pd.DataFrame):
            t = pd.DataFrame([], index=nidx)
            for col in in_dataframe:
                t[col] = scipy.interpolate.interp1d(in_dataframe.index.astype('int64').values,
                                                    in_dataframe[col].values,
                                                    kind=interp_method,
                                                    fill_value=np.nan,
                                                    bounds_error=False)(t.index.astype('int64').values)
        elif isinstance(in_dataframe, pd.Series):
            t = pd.Series(scipy.interpolate.interp1d(
                              in_dataframe.index.astype('int64').values,
                              in_dataframe.values,
                              kind=interp_method,
                              fill_value=np.nan,
                              bounds_error=False)(nidx.astype('int64').values),
                          index=nidx)
        else:
            raise TypeError("in_dataframe must be a pandas.DataFrame or pandas.Series")
    # actual norm calculation
    norm = t.groupby([t.index.month, t.index.day, t.index.hour]).mean().sort_index()
    # now replicate and trim to the desired output range
    start = pd.to_datetime(start).tz_localize(in_dataframe.index.tz)
    end = pd.to_datetime(end).tz_localize(in_dataframe.index.tz)
    # need both a non-leapyear and a leapyear version of the norm
    norm_ly = norm.copy()
    if norm.shape[0] == 366*hrs:
        norm = norm.drop((2, 29,))
    else:  # norm doesn't include any leapyear data
        assert norm.shape[0] == 365*hrs
        # make Feb 29 the mean of Feb 28 and Mar 1
        foo = (norm.loc[(2, 28,)] + norm.loc[(3, 1,)]) / 2.0
        foo.index = pd.MultiIndex.from_product(([2], [29], list(range(hrs))))
        norm_ly = pd.concat((norm_ly, foo)).sort_index()
    # build up a 'long normal' (lnorm) dataframe year by year by appending the norm or norm_ly
    lnorm = None
    for yr in np.arange(start.year, end.year+1):
        idx = pd.date_range(start='{}-{:02d}-{:02d} {:02d}:00:00'.format(yr, *norm.index[0]),
                            end='{}-{:02d}-{:02d} {:02d}:00:00'.format(yr, *norm.index[-1]),
                            freq=hrs_freq,
                            tz=in_dataframe.index.tz)
        if idx.shape[0] == 366*hrs:  # leap year
            foo = norm_ly.copy()
        else:
            assert idx.shape[0] == 365*hrs
            foo = norm.copy()
        foo.index = idx
        if lnorm is None:
            lnorm = foo
        else:
            lnorm = pd.concat((lnorm, foo))
    return lnorm.loc[start:end], t
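A minimal sketch of calling `compute_year_over_year_norm` on synthetic hourly data; the column name 'AT', the date ranges, and the UTC timezone below are illustrative assumptions, not values from this analysis:

In [ ]:
# minimal sketch on synthetic hourly data (illustrative values only):
# two non-leap years of a smooth seasonal cycle, normalized out to 2015
idx = pd.date_range('2013-01-01', '2014-12-31 23:00', freq='1h', tz='UTC')
synth = pd.DataFrame({'AT': 60 + 20*np.sin(2*np.pi*idx.dayofyear/365.25)}, index=idx)
tempnorm, tempresamp = compute_year_over_year_norm(
    synth, '2015-01-01', '2015-12-31',
    freq='hourly', interp_method='linear')
print(tempnorm.head())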
In [4]:
# Used internally
def compute_threshold_passings(ddvals, cumulative_DD_threshold):
    """For each start index, find the index where the cumulative sum of ddvals
    (starting at that index) passes cumulative_DD_threshold.
    ddvals needs to be a pandas Series or DataFrame column."""
    cDD = ddvals.cumsum(skipna=True)
    # the cumulative sum from i through j is cDD[j] - (cDD[i] - ddvals[i]), so the
    # crossing for start i is the first j where cDD[j] >= cDD[i] + threshold - ddvals[i]
    crossing_idx_rv = np.searchsorted(cDD, cDD+cumulative_DD_threshold-ddvals, side='left').astype(float)
    crossing_idx_rv[crossing_idx_rv == len(crossing_idx_rv)] = np.nan  # threshold never reached
    #crossing_idx_rv[np.array(pd.isnull(ddvals),dtype=bool)] = np.nan # @TCC Does not propagate nan correctly
    return crossing_idx_rv
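A toy check of the vectorized search (numbers invented for illustration): with constant daily values of 5 DD and a threshold of 9, the cumulative sum starting at index 0 first passes 9 at index 1, and so on; the last start index never accumulates enough:

In [ ]:
# toy check with invented numbers
toy = pd.Series([5.0, 5.0, 5.0, 5.0])
print(compute_threshold_passings(toy, 9.0))  # -> [ 1.  2.  3. nan]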
In [5]:
def _ddvals_to_outdf(dd, dd_threshold):
    """Used internally.
    Prepare a nice output dataframe from ddvals.
    """
    # find the point where the cumulative sum of degree days crosses the threshold
    dd['idx'] = compute_threshold_passings(dd['DD'], dd_threshold)
    # convert those (positional) indexes into end times
    i = dd['idx']  # just for convenience
    e = pd.Series(index=dd.index, dtype='datetime64[ns]')
    e[i.notnull()] = dd.index[i[i.notnull()].astype(int)]
    dd['end'] = e
    # ... and durations
    dd['dur'] = dd['end'] - dd.index + pd.Timedelta(days=1)
    dd['dur_days'] = dd['dur'].apply(lambda x: np.nan if pd.isnull(x) else x.days)
    return dd
In [6]:
def compute_summation_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """Hourly summation degree-day method
    to compute when a cumulative threshold has been passed
    given hourly temperature data.
    NOTE: will actually work with non-hourly and non-equally-spaced data,
    but accuracy decreases at lower sampling frequencies, and readings
    that cross midnight are not handled properly.
    """
    # copy so we don't accidentally mess up the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)
    # time difference between consecutive readings, in (float) days
    t['dDays'] = t.index
    t['dDays'] = t['dDays'].diff().dt.total_seconds() / (24*60*60)
    # temperature above base_temp (or 0 if below); assumes the temperature column is named 'AT'
    t['dAT'] = t['AT'] - base_temp
    t.loc[t['dAT'] < 0, 'dAT'] = 0
    # thermal accumulation for each time period
    t['DD'] = t['dAT'] * t['dDays']
    # compute the degree-days for each day
    dd = t['DD'].groupby(pd.Grouper(freq='D')).sum().to_frame(name='DD')
    return _ddvals_to_outdf(dd, dd_threshold)
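A small end-to-end sketch of the summation method on synthetic hourly data; the 'AT' column name, the base temperature of 50, and the 100 DD threshold are illustrative assumptions:

In [ ]:
# end-to-end sketch on synthetic data (illustrative values only):
# a daily sine cycle around 55 degrees, base temp 50, threshold 100 DD
hidx = pd.date_range('2016-03-01', '2016-04-30 23:00', freq='1h')
hourly = pd.DataFrame({'AT': 55 + 10*np.sin(2*np.pi*hidx.hour/24)}, index=hidx)
out = compute_summation_DD_from_hourly_df(hourly, base_temp=50, dd_threshold=100)
print(out[['DD', 'end', 'dur_days']].head())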
In [7]:
def compute_single_triangle_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """Single triangle degree-day method
    to compute when a cumulative threshold has been passed
    given hourly temperature data.
    """
    # copy so we don't accidentally mess up the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)
    # compute the degree-days for each day in the temperature input
    dfgrp = t.groupby(pd.Grouper(freq='D'))
    dd = dfgrp.agg(lambda x: _compute_daily_SingleTri_DD_from_hourly_groupby(x, base_temp))
    dd.columns = ['DD']
    return _ddvals_to_outdf(dd, dd_threshold)
# Used internally
def _compute_daily_SingleTri_DD(mint, maxt, avet, base_temp):
    """Use the single triangle degree-day method
    to compute the degree-day value for a single day.
    """
    dd = np.nan  # value which we're computing
    # adjust for observation time; not relevant here
    # if max <= base (curve entirely below base): no accumulation
    if maxt <= base_temp:
        dd = 0
    # min <= base: triangle from max down to base_temp
    elif mint <= base_temp:
        dd = (maxt-base_temp)*(maxt-base_temp)*(0.5/(maxt-mint))
    # else the whole triangle is above base: triangle from max to min + rectangle from min to base
    else:
        assert mint > base_temp
        dd = (maxt-mint)/2.0 + (mint-base_temp)
    return dd
def _compute_daily_SingleTri_DD_from_hourly_groupby(grp, base_temp):
    if grp.isnull().values.any():
        return np.nan
    mint = np.min(grp)
    maxt = np.max(grp)
    avet = (mint+maxt)/2.0  # simple midpoint (like in the refs)
    assert not np.isnan(mint)
    return _compute_daily_SingleTri_DD(
        mint=mint,
        maxt=maxt,
        avet=avet,
        base_temp=base_temp)
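Worked examples of the three single-triangle cases (numbers invented for illustration):

In [ ]:
# worked examples of the three single-triangle cases (invented numbers):
print(_compute_daily_SingleTri_DD(mint=40, maxt=50, avet=45, base_temp=55))  # 0: all below base
print(_compute_daily_SingleTri_DD(mint=50, maxt=70, avet=60, base_temp=55))  # (70-55)^2 * 0.5/(70-50) = 5.625
print(_compute_daily_SingleTri_DD(mint=60, maxt=70, avet=65, base_temp=55))  # (70-60)/2 + (60-55) = 10.0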
In [8]:
def compute_BM_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """Use the standard Baskerville-Emin (single sine) degree-day method
    to compute when a cumulative threshold has been passed
    given hourly temperature data
    (though its value is computed using only the daily min & max temperatures).
    """
    # copy so we don't accidentally mess up the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)
    # compute the degree-days for each day in the temperature input
    dfgrp = t.groupby(pd.Grouper(freq='D'))
    dd = dfgrp.agg(lambda x: _compute_daily_BM_DD_from_hourly_groupby(x, base_temp))
    dd.columns = ['DD']
    return _ddvals_to_outdf(dd, dd_threshold)

def compute_BM_DD_from_daily_min_and_max(mint, maxt, base_temp, dd_threshold):
    dd = pd.DataFrame({'minT': mint, 'maxT': maxt}, index=mint.index)
    dd['DD'] = pd.concat([mint, maxt], axis=1).apply(
        lambda x: _compute_daily_BM_DD(x[0], x[1], (x[0]+x[1])/2.0, base_temp),
        axis=1)
    return _ddvals_to_outdf(dd, dd_threshold)
# Used internally
def _compute_daily_BM_DD(mint, maxt, avet, base_temp):
    """Use the standard Baskerville-Emin (single sine) degree-day method
    to compute the degree-day value for a single day.
    """
    dd = np.nan  # value which we're computing
    # Step 1: adjust for observation time; not relevant here
    # Step 2: GDD = 0 if max < base (curve entirely below base)
    if maxt < base_temp:
        dd = 0
    # Step 3: calculate mean temp for the day; already done previously
    # Step 4: min >= base; the whole curve counts
    elif mint >= base_temp:
        dd = avet - base_temp
    # Step 5: else use the area under the sine curve minus the part below base
    else:
        W = (maxt-mint)/2.0
        tmp = (base_temp-avet) / W
        if tmp < -1:
            print('WARNING: (base_temp-avet)/W = {} : should be in [-1:1]'.format(tmp))
            tmp = -1
        if tmp > 1:
            print('WARNING: (base_temp-avet)/W = {} : should be in [-1:1]'.format(tmp))
            tmp = 1
        A = np.arcsin(tmp)
        dd = ((W*np.cos(A)) - ((base_temp-avet)*((np.pi/2.0)-A))) / np.pi
    return dd
def _compute_daily_BM_DD_from_hourly_groupby(grp, base_temp):
    mint = np.min(grp)
    maxt = np.max(grp)
    avet = (mint+maxt)/2.0  # simple midpoint (like in the refs)
    return _compute_daily_BM_DD(
        mint=mint,
        maxt=maxt,
        avet=avet,
        base_temp=base_temp)
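And the corresponding worked examples for the Baskerville-Emin single sine cases (same invented numbers as above; the clipped case evaluates to roughly 6.09 degree-days):

In [ ]:
# worked examples of the single sine cases (same invented numbers as above):
print(_compute_daily_BM_DD(mint=40, maxt=50, avet=45, base_temp=55))  # 0: all below base
print(_compute_daily_BM_DD(mint=60, maxt=70, avet=65, base_temp=55))  # avet - base = 10.0
# clipped case: W=10, A=arcsin((55-60)/10), dd = (W*cos(A) - (55-60)*(pi/2 - A))/pi ~= 6.09
print(_compute_daily_BM_DD(mint=50, maxt=70, avet=60, base_temp=55))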
In [9]:
## Cython version of compute_threshold_passings
## might be slightly faster (numpy.searchsorted is pretty good though),
## but it makes portability more complicated
# %load_ext Cython
In [10]:
# %%cython
# # Used internally
# import numpy as np
# cimport numpy as np
# import cython
# from libc.math cimport isnan
# def compute_threshold_passings(double[:] ddvals, double cumulative_DD_threshold):
#     cdef double cDD
#     cdef int found
#     cdef double[:] crossing_idx_rv = np.zeros(len(ddvals))*np.nan
#     #cdef double[:] cdd_rv = np.zeros(len(ddvals))*np.nan
#     # @TCC -- Probably don't need to do this for all possible start dates...
#     for i in range(len(ddvals)):
#         cDD = 0
#         found = 0
#         for j in range(i, len(ddvals)):
#             if isnan(ddvals[j]):  # NaN input: no crossing from this start date
#                 crossing_idx_rv[i] = np.nan
#                 found = 1  # we might find more later
#                 break
#             else:
#                 cDD += ddvals[j]
#                 if cDD > cumulative_DD_threshold:
#                     found = 1
#                     crossing_idx_rv[i] = j
#                     #cdd_rv[i] = cDD  # @TCC Not really needed
#                     break
#         if found != 1:  # won't find any more either
#             break
#     return crossing_idx_rv  #, cdd_rv